/************************************************
* Copyright (c) 2003 Michael Cafarella
***********************************************/
package net.nutch.quality;
import java.io.*;
import java.util.*;
import net.nutch.html.*;
import net.nutch.searcher.*;
/*****************************************************
* The ResultTestTool lets us test the quality of our
* search engine. It uses a list of queries and runs
* them against Nutch. It then runs the same list of
* queries against some authoritative source, the results
* of which are found in a flat file. This source might
* be hand or machine generated - this tool just needs
* a list of results.
*
* We compute various stats based on how Nutch compares.
*
* This lets us tell, roughly, how much of a difference
* our improvements make.
*
* @author Mike Cafarella
*****************************************************/
public class ResultTestTool {
//
// Interfaces and inner classes for the different
// search result sources
//
/**
 * Minimal view of a search engine: given a query string, hand back
 * the top hits as a Vector of URL Strings. Every result source this
 * tool compares (live or canned) is accessed through this interface.
 */
interface SearchEngine {
    /**
     * Run a query.
     *
     * @param query      the query text
     * @param maxResults upper bound on the number of URLs wanted
     * @return a Vector of URL Strings for the top hits
     * @throws IOException if the underlying engine fails
     */
    Vector search(String query, int maxResults) throws IOException;
}
/**
 * SearchEngine backed by our own Nutch system. A NutchBean is
 * opened over the given directory and every query is sent to it.
 */
class NutchEngine implements SearchEngine {
    NutchBean searcher;

    /**
     * Open a searcher over the given directory.
     *
     * @param dir location of the segments dir
     * @throws IOException if the NutchBean cannot be created
     */
    public NutchEngine(String dir) throws IOException {
        this.searcher = new NutchBean(new File(dir));
    }

    /**
     * Run the query against Nutch and collect the "url" detail of
     * each hit, returning at most maxResults URL Strings.
     */
    public Vector search(String queryStr, int maxResults) throws IOException {
        Vector urls = new Vector();
        Hits hits = searcher.search(Query.parse(queryStr), maxResults);

        // Never walk past either the caller's limit or the hit total
        long limit = Math.min(hits.getTotal(), maxResults);
        int idx = 0;
        while (idx < limit) {
            HitDetails hitInfo = searcher.getDetails(hits.getHit(idx));
            urls.add(hitInfo.getValue("url"));
            idx++;
        }
        return urls;
    }
}
/**
 * SearchEngine backed by a pre-computed flat file of results rather
 * than a live search. The whole file is loaded into memory once, at
 * construction time, and queries are answered by table lookup.
 */
class ResultsList implements SearchEngine {
    // query String -> Vector of result Strings, in file order
    Hashtable resultTable = new Hashtable();

    /**
     * Load a results list. The file layout, as read here, is:
     * an int query count, then for each query a UTF string (the
     * query), an int result count, and that many UTF strings.
     * If the same query appears twice, the later entry wins.
     *
     * @param resultsList the binary results file
     * @throws IOException if the file cannot be opened or is truncated
     */
    public ResultsList(File resultsList) throws IOException {
        // Buffer the file: DataInputStream issues many tiny reads, which
        // are very slow when they go straight to a FileInputStream.
        DataInputStream in = new DataInputStream(
            new BufferedInputStream(new FileInputStream(resultsList)));
        try {
            int numQueries = in.readInt();
            System.out.println("Number queries: " + numQueries);
            for (int i = 0; i < numQueries; i++) {
                String curQuery = in.readUTF();
                int numResults = in.readInt();
                if (verbose) {
                    System.out.println("For " + curQuery + ": " + numResults);
                }
                // Extract all the results
                Vector resultList = new Vector();
                for (int j = 0; j < numResults; j++) {
                    resultList.add(in.readUTF());
                }
                resultTable.put(curQuery, resultList);
            }
        } finally {
            in.close();
        }
    }

    /**
     * Look up the stored results for a query.
     *
     * @param queryStr   the query, matched exactly against the file's keys
     * @param maxResults upper bound on the number of results returned
     * @return a fresh Vector holding the first maxResults stored results;
     *         empty if the query is unknown or maxResults is not positive
     */
    public Vector search(String queryStr, int maxResults) {
        Vector results = new Vector();
        Vector hits = (Vector) resultTable.get(queryStr);
        if (hits != null) {
            int limit = Math.min(hits.size(), maxResults);
            for (int i = 0; i < limit; i++) {
                results.add(hits.elementAt(i));
            }
        }
        return results;
    }
}
//
// Interfaces and Inner classes for measuring quality
//
/**
 * A QualityMetric accumulates one overall score across many
 * calls to computeMetric(), one call per query.
 */
interface QualityMetric {
    /** Fold one query's test results and answer results into the running totals. */
    void computeMetric(String query, Vector testResults, Vector answerResults);

    /** The accumulated score so far. */
    double getScore();

    /** Points earned so far. */
    long scoredPoints();

    /** Points that could have been earned so far. */
    long maxPoints();

    /** Short name used when reporting this metric. */
    String getName();
}
/**
 * The PerfectPage metric works as follows:
 *
 * For the purposes of our metric, we assume that the first answer
 * given in "answerResults" is the "Perfect Page" for that query.
 *
 * If we find the PP anywhere in the testResults we are handed, we
 * give a point. (Callers are expected to pass only the top chunk
 * of test results.)
 *
 * Awarding a half-point for a page from the PP's domain is planned
 * but not yet implemented.
 *
 * If there are no results from answerResults, it's a no-op.
 *
 * Scores are computed across many queries. We divide the
 * actual points by possible points, and give a score
 * between 0 and 1.0.
 */
class PerfectPageMetric implements QualityMetric {
    long points = 0, possiblePoints = 0;

    /**
     * Score one query: one possible point per query that has at least
     * one answer, earned when the first answer appears in testResults.
     *
     * @param query         the query text (not used by this metric)
     * @param testResults   URLs returned by the engine under test; may be null
     * @param answerResults URLs from the reference source; may be null or empty
     */
    public void computeMetric(String query, Vector testResults, Vector answerResults) {
        // Without a reference answer this query cannot be scored at all
        if (answerResults == null || answerResults.isEmpty()) {
            return;
        }
        possiblePoints++;
        String perfectPage = (String) answerResults.elementAt(0);

        // No test results: the point was possible but is not earned
        if (testResults == null) {
            return;
        }
        if (verbose) {
            System.out.println("PerfectPage: " + perfectPage);
        }
        for (Enumeration e = testResults.elements(); e.hasMoreElements(); ) {
            String curTest = (String) e.nextElement();
            // Null-safe: a hit without a URL simply does not match
            if (curTest != null && curTest.equals(perfectPage)) {
                points++;
                if (verbose) {
                    System.out.println(" MATCHED: " + curTest);
                }
                break;
            }
            if (verbose) {
                System.out.println(" failed: " + curTest);
            }
        }
    }

    /**
     * Get the score, normalized to 0 .. 1.0. Returns 0.0 (rather than
     * NaN) when no query has been scored yet.
     */
    public double getScore() {
        if (possiblePoints == 0) {
            return 0.0;
        }
        return (double) points / possiblePoints;
    }

    /** Points earned so far. */
    public long scoredPoints() {
        return points;
    }

    /** Number of queries that had at least one answer. */
    public long maxPoints() {
        return possiblePoints;
    }

    /** Name used when reporting this metric. */
    public String getName() {
        return "PerfectPage";
    }
}
/**
 * The GoodEnough metric works as follows:
 *
 * Take the first topChunk entries of both testResults and
 * answerResults.
 *
 * For every one of those answers that also appears among those
 * test results, we award a point. Each answer can earn at most one
 * point, so the points never exceed the possible points.
 *
 * We divide the actual points by the possible points, and give
 * a score between 0 and 1.0.
 */
class GoodEnoughMetric implements QualityMetric {
    long points = 0, possiblePoints = 0;

    /**
     * Score one query. The possible points are the number of answers
     * considered (at most topChunk); a point is earned for each of
     * them found in the top chunk of testResults.
     *
     * @param query         the query text (not used by this metric)
     * @param testResults   URLs returned by the engine under test; may be null
     * @param answerResults URLs from the reference source; may be null or empty
     */
    public void computeMetric(String query, Vector testResults, Vector answerResults) {
        if (answerResults == null || answerResults.isEmpty()) {
            return;
        }
        // Clamp at zero so a non-positive topChunk cannot subtract points
        int answerLimit = Math.max(0, Math.min(answerResults.size(), topChunk));
        possiblePoints += answerLimit;
        if (testResults == null) {
            return;
        }

        // Collect the top chunk of test results once. Using a set means a
        // URL repeated in the test results cannot be counted twice, and
        // each answer lookup is constant time instead of a nested scan.
        int testLimit = Math.min(testResults.size(), topChunk);
        Set returned = new HashSet();
        for (int i = 0; i < testLimit; i++) {
            Object item = testResults.elementAt(i);
            if (item != null) {
                returned.add(item);
            }
        }

        // One point per considered answer that the test engine returned
        for (int i = 0; i < answerLimit; i++) {
            if (returned.contains(answerResults.elementAt(i))) {
                points++;
            }
        }
    }

    /**
     * Get the score, normalized to 0 .. 1.0. Returns 0.0 (rather than
     * NaN) when no points have been possible yet.
     */
    public double getScore() {
        if (possiblePoints == 0) {
            return 0.0;
        }
        return (double) points / possiblePoints;
    }

    /** Points earned so far. */
    public long scoredPoints() {
        return points;
    }

    /** Total number of answers considered so far. */
    public long maxPoints() {
        return possiblePoints;
    }

    /** Name used when reporting this metric. */
    public String getName() {
        return "GoodEnough";
    }
}
//
// ResultTestTool members
//
// Engine under test; assigned in the constructor
SearchEngine testEngine = null;
// Reference source whose results the test engine is compared against
SearchEngine answerEngine = null;
// When true, the inner classes and testQueries() print per-query detail
boolean verbose = false;
// Number of top results requested from each engine and scored
int topChunk = 0;
/**
 * Build a ResultTestTool.
 *
 * @param segments directory handed to the Nutch engine under test
 * @param results  flat file of reference results
 * @param verbose  whether to print per-query detail
 * @param topChunk how many top results to request and score per query
 * @throws IOException if either engine cannot be set up
 */
public ResultTestTool(String segments, String results, boolean verbose, int topChunk) throws IOException {
    // Set the flags before building the engines: the engines are inner
    // classes and read this.verbose while they are being constructed,
    // so assigning it afterwards would silence their verbose output.
    this.verbose = verbose;
    this.topChunk = topChunk;
    testEngine = new NutchEngine(segments);
    answerEngine = new ResultsList(new File(results));
}
/**
 * Score the queries in the given file with every metric we know
 * about (PerfectPage and GoodEnough), then print one summary line
 * per metric.
 *
 * @param queryFile text file with one query per line
 * @throws IOException if the query file or an engine fails
 */
public void testAllMetrics(File queryFile) throws IOException {
    // The metrics, in reporting order
    QualityMetric[] metrics = {
        new PerfectPageMetric(),
        new GoodEnoughMetric()
    };

    // Feed every query through all of them
    System.out.println("Running test suite");
    testQueries(queryFile, metrics);

    // Report
    System.out.println("Metric Results");
    System.out.println("-------------------------------");
    for (int idx = 0; idx < metrics.length; idx++) {
        QualityMetric metric = metrics[idx];
        String line = metric.getName() + ": " + metric.scoredPoints() + " of " + metric.maxPoints() + " (" + metric.getScore() + ")";
        System.out.println(line);
    }
}
/**
 * Run a battery of queries against both the engine under test and
 * the answer engine, feeding each pair of result lists to every
 * metric.
 *
 * @param queryFile text file with one query per line; lines are
 *                  trimmed and blank lines are skipped
 * @param metrics   metrics to update for each query
 * @throws IOException if the file cannot be read or a search fails
 */
public void testQueries(File queryFile, QualityMetric metrics[]) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(queryFile));
    try {
        String queryStr = null;
        while ((queryStr = reader.readLine()) != null) {
            queryStr = queryStr.trim();
            // A blank line (e.g. a trailing newline) is not a query;
            // don't send an empty search to either engine.
            if (queryStr.length() == 0) {
                continue;
            }
            // First, execute our own search
            Vector testResults = testEngine.search(queryStr, topChunk);
            // Second, search against other results
            Vector answerResults = answerEngine.search(queryStr, topChunk);
            // Compute stats
            if (verbose) {
                System.out.println("Running test on " + queryStr);
            }
            for (int i = 0; i < metrics.length; i++) {
                metrics[i].computeMetric(queryStr, testResults, answerResults);
            }
        }
    } finally {
        reader.close();
    }
}
/**
 * Run the ResultTestTool from the command line.
 *
 * Arguments: segments dir, result-set file, query-list file, then
 * optionally -verbose and/or -topChunk chunkSize (default 10).
 * Unrecognized options are ignored. A missing or invalid chunkSize
 * prints an error plus the usage line and exits without running.
 *
 * @throws IOException if the engines or input files cannot be read
 */
public static void main(String argv[]) throws IOException {
    final String usage = "Usage: java net.nutch.quality.ResultTestTool <segments> <resultSet> <queryList> [-verbose] [-topChunk chunkSize]";
    if (argv.length < 3) {
        System.out.println(usage);
        return;
    }
    boolean verbose = false;
    int topChunk = 10;
    for (int i = 3; i < argv.length; i++) {
        if ("-verbose".equals(argv[i])) {
            verbose = true;
        } else if ("-topChunk".equals(argv[i])) {
            // The option needs a value after it
            if (i + 1 >= argv.length) {
                System.err.println("-topChunk requires a chunkSize argument");
                System.err.println(usage);
                return;
            }
            String value = argv[++i];
            int parsed;
            try {
                parsed = Integer.parseInt(value);
            } catch (NumberFormatException e) {
                parsed = -1; // reported by the range check below
            }
            if (parsed < 1) {
                System.err.println("Invalid chunkSize '" + value + "': expected a positive integer");
                System.err.println(usage);
                return;
            }
            topChunk = parsed;
        }
    }
    ResultTestTool rtt = new ResultTestTool(argv[0], argv[1], verbose, topChunk);
    rtt.testAllMetrics(new File(argv[2]));
}
}